#project website. can you see it? https://josiealford14.github.io/therealcostofdrugs/
library(tidyverse)
library(dplyr)
library(rvest)
library(httr)
library(janitor)
library(stringr)
library(forcats)
library(tidytext)
library(viridis)
library(broom)
library(plotly)
# # API code
# url = "https://data.medicaid.gov/resource/tau9-gfwr.csv"
# drug_price =
# GET(url, query = list("$limit" = 9999999)) %>%
# content("parsed") %>%
# clean_names()
# # CSV code
drug_price =
read_csv("../data/drug_price.csv") %>%
clean_names()
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## as_of_date = col_date(format = ""),
## classification_for_rate_setting = col_character(),
## corresponding_generic_drug_effective_date = col_character(),
## corresponding_generic_drug_nadac_per_unit = col_character(),
## effective_date = col_date(format = ""),
## explanation_code = col_character(),
## nadac_per_unit = col_double(),
## ndc = col_character(),
## ndc_description = col_character(),
## otc = col_character(),
## pharmacy_type_indicator = col_character(),
## pricing_unit = col_character()
## )
drug_price dataset is huge and takes a while to load.as_of_date: date of data postingclassification_for_rate_setting: indicates whether, for NADAC calculation, the NDC was considered brand (‘B’) or generic (‘G’) or brand (‘B’) approved under an Abbreviated New Drug Application (ANDA) (‘B-ANDA’).corresponding_generic_drug_effecitve_date: effective date of when the Corresponding Generic Drug NADAC Per Unit is assigned to a multiple source brand drug NDCcorresponding_generic_drug_nadac_per_unit: NADAC for the corresponding generic drugeffective_date: effective date of the NADAC Per Unit costexplanation_code: codes that pertain to how the NADAC was calculated. see explanation code descriptions herenadac_per_unit: National Average Drug Acquisition Cost per unitndc: 11-digit FDA code that includes the labeler code, product code, and package codendc_description: drug name, strength, and dosage form of the drugotc: over-the-counter (OTC) product (‘Y’ or ‘N’)pharmacy_type_indicator: source of pharmacy survey data used to calculate the NADAC. ‘C/I’ indicates data was collected from surveys of Chain/Independent pharmacies. Other pharmacy type indicators are not used at this time.pricing_unit: pricing unit for the associated NDC (‘ML’, ‘GM’ or ‘EA’)effective_date, ndc, ndc_description, otc, classification_for_rate_setting, pricing_unit, and nadac_per_unit. We will rename the variables effective_date, ndc_description, classification_for_rate_setting, and nadac_per_unit as date, drug_name, drug_type, and unit_price respectively. We will also split the date variable into the year, month and date variables.drug_price =
drug_price %>%
rename(trade_name = ndc_description,
drug_type = classification_for_rate_setting,
unit_price = nadac_per_unit) %>%
mutate(trade_name = str_replace(trade_name, "[0-9]", "_"),
trade_name = str_to_title(trade_name),
otc = recode(otc, "Y" = "yes", "N" = "no"),
otc = str_to_title(otc),
otc = as.factor(otc)) %>%
separate(trade_name, into = c("trade_name", "remove"), sep = "_") %>%
separate(effective_date, into = c("year", "month", "date")) %>%
select(-c(date, x1, remove)) %>%
mutate(trade_name = trimws(trade_name, which = c("both"))) %>%
separate(trade_name, into = c("trade_name", "remove"), sep = " ") %>%
filter(is.na(remove)) %>%
select(-remove) %>%
filter(pricing_unit == "GM" | pricing_unit == "EA") %>%
select(trade_name, everything())
## Warning: Too few values at 308308 locations: 1, 2, 7, 17, 18, 19, 30, 45,
## 48, 56, 59, 60, 64, 66, 68, 85, 86, 87, 92, 107, ...
## Warning in format.POSIXlt(as.POSIXlt(x), ...): unknown timezone 'zone/tz/
## 2017c.1.0/zoneinfo/America/New_York'
## Warning: Too many values at 592722 locations: 5, 17, 18, 48, 68, 112, 128,
## 136, 138, 173, 196, 215, 222, 231, 235, 241, 265, 273, 279, 294, ...
## Warning: Too few values at 2730099 locations: 1, 3, 4, 6, 8, 9, 10, 11, 12,
## 13, 15, 16, 20, 21, 23, 24, 25, 26, 27, 28, ...
drug_price %>%
filter(trade_name == "Abacavir") %>%
ggplot(aes(x = year, y = unit_price)) + geom_violin(aes(fill = year), color = "blue", alpha = .5)
new_drug_price = drug_price %>%
group_by(trade_name, drug_type, otc, pricing_unit, year) %>%
summarize(avg_price = mean(unit_price, na.rm = TRUE))
new_drug_price
## # A tibble: 6,023 x 6
## # Groups: trade_name, drug_type, otc, pricing_unit [?]
## trade_name drug_type otc pricing_unit year avg_price
## <chr> <chr> <fctr> <chr> <chr> <dbl>
## 1 G Yes EA 2013 0.2434375
## 2 G Yes EA 2014 0.2430778
## 3 G Yes EA 2015 0.2556137
## 4 G Yes EA 2016 0.2538263
## 5 G Yes EA 2017 0.2339103
## 6 G Yes GM 2016 0.3077704
## 7 G Yes GM 2017 0.3087167
## 8 Abacavir G No EA 2013 5.3170850
## 9 Abacavir G No EA 2014 4.7696039
## 10 Abacavir G No EA 2015 3.6822746
## # ... with 6,013 more rows
url = "https://report.nih.gov/categorical_spending.aspx"
NIH_xml = read_html(url)
NIH_funding = (NIH_xml %>% html_nodes(css = "table"))[[3]] %>% html_table() %>% as_tibble()
NIH_funding = NIH_funding %>%
clean_names() %>%
mutate(US_mortality_2015 = x2015_us_mortality_19)
NIH_funding$x2015_us_prevalence_standard_error_19 = recode(NIH_funding$x2015_us_prevalence_standard_error_19 , "-" = "NA")
NIH_funding$US_mortality_2015 = recode(NIH_funding$US_mortality_2015, "-" = "NA")
NIH_funding = NIH_funding %>%
separate(x2015_us_prevalence_standard_error_19, into = c("prevalence", "remove"), sep = "\\%") %>%
select(-x2015_us_mortality_19, -remove)
## Warning: Too many values at 36 locations: 7, 19, 21, 24, 25, 35, 37, 40,
## 46, 48, 54, 59, 74, 80, 81, 89, 93, 114, 115, 117, ...
## Warning: Too few values at 246 locations: 1, 2, 3, 4, 5, 6, 8, 9, 10, 11,
## 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, ...
NIH_funding$fy_2013actual = recode(NIH_funding$fy_2013actual, "+" = "NNA")
NIH_funding$fy_2014actual = recode(NIH_funding$fy_2014actual, "+" = "NNA")
NIH_funding$fy_2015actual = recode(NIH_funding$fy_2015actual, "+" = "NNA")
NIH_funding$fy_2016actual = recode(NIH_funding$fy_2016actual, "+" = "NNA")
NIH_funding = NIH_funding %>%
mutate(fy_2013 = as.numeric(substring(fy_2013actual, 2))) %>%
mutate(fy_2014 = as.numeric(substring(fy_2014actual, 2))) %>%
mutate(fy_2015 = as.numeric(substring(fy_2015actual, 2))) %>%
mutate(fy_2016 = as.numeric(substring(fy_2016actual, 2))) %>%
mutate(fy_2017 = as.numeric(substring(fy_2017estimated_enacted, 2))) %>%
mutate(fy_2018estimated = as.numeric(substring(fy_2018estimated, 2)))
## Warning in evalq(as.numeric(substring(fy_2013actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2014actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2015actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2016actual, 2)), <environment>):
## NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2017estimated_enacted, 2)),
## <environment>): NAs introduced by coercion
## Warning in evalq(as.numeric(substring(fy_2018estimated, 2)),
## <environment>): NAs introduced by coercion
NIH_funding$prevalence = as.numeric(NIH_funding$prevalence)
## Warning: NAs introduced by coercion
NIH_funding$US_mortality_2015 = as.numeric(NIH_funding$US_mortality_2015)
## Warning: NAs introduced by coercion
NIH_funding = NIH_funding %>%
select(-c(fy_2013actual, fy_2014actual, fy_2015actual, fy_2016actual, fy_2017estimated_enacted))
final_NIH_funding = NIH_funding %>%
rename(disease = research_disease_areas_dollars_in_millions_and_rounded,
'2013' = fy_2013,
'2014' = fy_2014,
'2015' = fy_2015,
'2016' = fy_2016,
'2017' = fy_2017,
'2018' = fy_2018estimated) %>%
gather(key = "year", value = funding, c("2013", "2014", "2015", "2016", "2017", "2018")) %>%
separate(disease, into = c("disease", "remove"), sep = " ") %>%
select(-remove)
## Warning: Too many values at 666 locations: 1, 2, 3, 4, 6, 7, 10, 11, 12,
## 24, 28, 30, 38, 50, 52, 53, 54, 55, 56, 58, ...
## Warning: Too few values at 384 locations: 5, 8, 13, 14, 17, 18, 19, 21, 23,
## 25, 32, 33, 39, 40, 44, 47, 66, 71, 73, 84, ...
rm(NIH_funding)
final_NIH_funding
## # A tibble: 1,692 x 5
## disease prevalence US_mortality_2015 year funding
## * <chr> <dbl> <dbl> <chr> <dbl>
## 1 Acquired NA NA 2013 NA
## 2 Acute NA NA 2013 95
## 3 Adolescent NA 138 2013 70
## 4 Agent NA NA 2013 10
## 5 Aging NA NA 2013 NA
## 6 Alcoholism, NA NA 2013 437
## 7 Allergic 8.2 77 2013 9
## 8 ALS NA NA 2013 39
## 9 Alzheimer's NA NA 2013 504
## 10 Alzheimer's NA NA 2013 NA
## # ... with 1,682 more rows
Below is a description of the dataset and variables from https://www.fda.gov/Drugs/InformationOnDrugs/ucm129689.htm
New Drug Application Type: The type of new drug application approval, either New Drug Applications(NDA or innovator) or Abbreviated New Drug Applications(ANDA or generic)
New Drug Application (NDA) Number (appl_no) : The FDA assigned number to the application.
Product Number: The FDA assigned number to identify the application products. Each strength is a separate product. May repeat for multiple part products.
Patent Number (patent_num): Patent numbers as submitted by the applicant holder for patents covered by the statutory provisions. May repeat for multiple applications and multiple products.
Pediatric Exclusivity Granted: Yes or No. Was pediatric exclusivity granted by the agency?
Patent Expire Date: The date the patent expires as submitted by the applicant holder including applicable extensions. This was split into Year, Month, and Day variables
Drug Substance Flag: Patents submitted on FDA Form 3542 and listed after August 18, 2003 may have a drug substance flag indicating the sponsor submitted the patent as claiming the drug substance.
Drug Product Flag: Patents submitted on FDA Form 3542 and listed after August 18, 2003 may have a drug product flag indicating the sponsor submitted the patent as claiming the drug product.
Patent Use Code: Code to designate a use patent that covers the approved indication or use of a drug product. May repeat for multiple applications, multiple products and multiple patents.
Patent Delist Request Flag: Sponsor has requested patent be delisted. This patent has remained listed because, under Section 505(j)(5)(D)(i) of the Act, a first applicant may retain eligibility for 180-day exclusivity based on a paragraph IV certification to this patent for a certain period. Applicants under Section 505(b)(2) are not required to certify to patents where this flag is set to Y.
patents = read_csv("./data/patent.csv")
## Parsed with column specification:
## cols(
## appl_type = col_character(),
## appl_no = col_integer(),
## product_no = col_integer(),
## patent_no = col_character(),
## patexpstr = col_character(),
## patexp = col_character(),
## drugsub = col_character(),
## drugprod = col_character(),
## patuse = col_character(),
## delist = col_character()
## )
patents = patents %>%
clean_names() %>%
rename(drug_type = appl_type) %>%
mutate(drug_type = recode(drug_type, "N" = "B")) %>%
separate(patexp, into = c("month_pat_expires", "day_pat_expires", "year_pat_expires"), sep = "/") %>%
separate(patent_no, into = c("patent_num", "pediatric_exclusivity_granted"), sep = "\\*")
## Warning: Too few values at 12167 locations: 1, 2, 3, 5, 7, 9, 10, 11, 12,
## 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, ...
patents$pediatric_exclusivity_granted[is.na(patents$pediatric_exclusivity_granted)] = "no"
patents$drugsub[is.na(patents$drugsub)] = "no"
patents$drugprod[is.na(patents$drugprod)] = "no"
patents$delist[is.na(patents$delist)] = "no"
patents = patents %>%
mutate(pediatric_exclusivity_granted = factor(pediatric_exclusivity_granted, levels = c("no", "PED"), labels = c("No", "Yes"))) %>%
mutate(drug_substance_flag = factor(drugsub, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
mutate(drug_prod_flag = factor(drugprod, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
mutate(patent_delisted = factor(delist, levels = c("no", "Y"), labels = c("no", "yes"))) %>%
select(-drugsub, -drugprod, -patexpstr) %>%
rename(pat_use_code = patuse)
patents
## # A tibble: 13,858 x 13
## drug_type appl_no product_no patent_num pediatric_exclusivity_granted
## <chr> <int> <int> <chr> <fctr>
## 1 B 22425 1 5223510 No
## 2 B 22425 1 5223510 No
## 3 B 22307 1 5288726 No
## 4 B 22307 1 5288726 Yes
## 5 B 21462 1 5344932 No
## 6 B 21462 1 5344932 Yes
## 7 B 22088 1 5362718 No
## 8 B 22088 1 5362718 Yes
## 9 B 21060 1 5364842 No
## 10 B 21060 1 5364842 No
## # ... with 13,848 more rows, and 8 more variables:
## # month_pat_expires <chr>, day_pat_expires <chr>,
## # year_pat_expires <chr>, pat_use_code <chr>, delist <chr>,
## # drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## # patent_delisted <fctr>
*The Products dataset from the FDA Orange Book
Below is a description of the dataset and variables from https://www.fda.gov/Drugs/InformationOnDrugs/ucm129689.htm
Ingredient: The active ingredient(s) for the product. Multiple ingredients are in alphabetical order, separated by a semicolon.
Dose form;The product dosage form
Route: Product dosage route
Trade_Name: The trade name of the product as shown on the labeling.
Applicant: The firm name holding legal responsibility for the new drug application. The firm name is condensed to a maximum twenty character unique string.
Strength: The potency of the active ingredient. May repeat for multiple part products.
Drug Application Type: The type of new drug application approval. NDA orANDA as for patents dataset.
New Drug Application (NDA) Number: The FDA assigned number to the application.
Product Number: The FDA assigned number to identify the application products. Each strength is a separate product. May repeat for multiple part products.
Therapeutic Equivalence (TE) Code: The TE Code indicates the therapeutic equivalence rating of generic to innovator Rx products.
Approval Date: The date the product was approved as stated in the FDA approval letter to the applicant. Products approved prior to the January 1, 1982 contain the phrase: “Approved prior to Jan 1, 1982”. This separated into Year, Month, and Day variables.
Reference Listed Drug (RLD) Updated: The RLD is a drug product approved under section 505(c) of the FD&C Act for which FDA has made a finding of safety and effectiveness. In the electronic Orange Book, an RLD is identified by “RLD” in the RLD column.
Reference Standard (RS) New! :A “reference standard” is the drug product selected by FDA that an applicant seeking approval of an ANDA must use in conducting an in vivo bioequivalence study required for approval of an ANDA. In the electronic Orange Book, a reference standard is identified by “RS” in the RS column.
*Type: The group or category of approved drugs. RX (prescription), OTC (over the counter), or DISCN (discontinued).
*Applicant Full Name: The full name of the firm holding legal responsibility for the new drug application.
products = read_csv("./data/products.csv")
## Parsed with column specification:
## cols(
## ingredient = col_character(),
## dfroute = col_character(),
## doseform = col_character(),
## route = col_character(),
## trade_name = col_character(),
## applicant = col_character(),
## strength = col_character(),
## appl_type = col_character(),
## appl_no = col_integer(),
## product_no = col_integer(),
## tecodestr = col_character(),
## tecode = col_integer(),
## appdatestr = col_character(),
## appdate = col_character(),
## rld = col_character(),
## rs = col_character(),
## type = col_character(),
## appfull = col_character()
## )
products = products %>%
rename(drug_type = appl_type,
otc = type) %>%
mutate(drug_type = recode(drug_type, "A" = "G", "N" = "B"),
otc = recode(otc, "OTC" = "yes", "RX" = "no", "DISCN" = "no")) %>%
separate(appdate, into = c("approval_month", "approval_day", "approval_year"), sep = "/") %>%
#separate(doseform, into = c("form", "release_type"), sep = ",")
mutate(otc = factor(otc)) %>%
mutate(ref_listed_drug = factor(rld)) %>%
mutate(ref_std = factor(rs)) %>%
select(-appdatestr, -rld, -rs, -dfroute)
products
## # A tibble: 34,209 x 18
## ingredient doseform route
## <chr> <chr> <chr>
## 1 ABACAVIR SULFATE SOLUTION ORAL
## 2 ABACAVIR SULFATE SOLUTION ORAL
## 3 ABACAVIR SULFATE TABLET ORAL
## 4 ABACAVIR SULFATE TABLET ORAL
## 5 ABACAVIR SULFATE TABLET ORAL
## 6 ABACAVIR SULFATE TABLET ORAL
## 7 ABACAVIR SULFATE TABLET ORAL
## 8 ABACAVIR SULFATE TABLET ORAL
## 9 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE TABLET ORAL
## 10 ABACAVIR SULFATE; LAMIVUDINE TABLET ORAL
## # ... with 34,199 more rows, and 15 more variables: trade_name <chr>,
## # applicant <chr>, strength <chr>, drug_type <chr>, appl_no <int>,
## # product_no <int>, tecodestr <chr>, tecode <int>, approval_month <chr>,
## # approval_day <chr>, approval_year <chr>, otc <fctr>, appfull <chr>,
## # ref_listed_drug <fctr>, ref_std <fctr>
We load and tidy the exclusivity dataset
exclusivity = read_csv("./data/exclusivity.csv") %>%
clean_names() %>%
rename(drug_type = appl_type) %>%
mutate(drug_type = recode(drug_type, "A" = "G", "N" = "B")) %>%
separate(excldate, into = c("month_excl_expires", "date_excl_expires", "year_excl_expires"))
## Parsed with column specification:
## cols(
## appl_type = col_character(),
## appl_no = col_integer(),
## product_no = col_integer(),
## exclcode = col_character(),
## excldatestr = col_character(),
## excldate = col_character()
## )
exclusivity
## # A tibble: 1,806 x 8
## drug_type appl_no product_no exclcode excldatestr month_excl_expires
## * <chr> <int> <int> <chr> <chr> <chr>
## 1 G 65488 1 PC Mar 26, 2018 3
## 2 G 65488 2 PC Mar 26, 2018 3
## 3 G 78276 1 PC Apr 24, 2017 4
## 4 G 78276 2 PC Apr 24, 2017 4
## 5 G 78276 3 PC Apr 24, 2017 4
## 6 G 78340 1 PC Jul 30, 2016 7
## 7 G 78340 2 PC Jul 30, 2016 7
## 8 G 78560 1 PC Jun 10, 2017 6
## 9 G 78827 1 PC Apr 24, 2017 4
## 10 G 78827 2 PC Apr 24, 2017 4
## # ... with 1,796 more rows, and 2 more variables: date_excl_expires <chr>,
## # year_excl_expires <chr>
The tidied exclusivity dataset has 1806 observations and 8 variables:
year, month, and date the exclusivity expiresappl_type: type of new drug application approval. New Drug Applications (NDA or innovator) are “N”. Abbreviated New Drug Applications (ANDA or generic) are “A”.appl_no: FDA assigned number to the application. Format is nnnnnn.product_no: FDA assigned number to identify the application products. Each strength is a separate product. May repeat for multiple part products. Format is nnn.exclcode: Code to designate exclusivity granted by the FDA to a drug product. Format is nnnnnnnnnn.excldatestr: The date the exclusivity expires. Format is MMM DD, YYYY.We load the ingredient dataset.
ingredient = read_csv("./data/ingredient.csv") %>%
clean_names() %>%
select(singnum, singredient, everything())
## Parsed with column specification:
## cols(
## ingredient = col_character(),
## singnum = col_integer(),
## singredient = col_character()
## )
ingredient
## # A tibble: 3,371 x 3
## singnum singredient
## <int> <chr>
## 1 1 ABACAVIR SULFATE
## 2 1 ABACAVIR SULFATE
## 3 2 DOLUTEGRAVIR SODIUM
## 4 3 LAMIVUDINE
## 5 1 ABACAVIR SULFATE
## 6 2 LAMIVUDINE
## 7 1 ABACAVIR SULFATE
## 8 2 LAMIVUDINE
## 9 3 ZIDOVUDINE
## 10 1 ABALOPARATIDE
## # ... with 3,361 more rows, and 1 more variables: ingredient <chr>
The tidied ingredient dataset has 3371 observations and 3 variables:
singnum: The number assigned to the single ingredient for each product.singredient: each single ingredient in the product.ingredient: The active ingredient(s) for the product. Multiple ingredients are in alphabetical order, separated by a semicolon.Scraping data on drugs per condition from this website
urls = paste("https://www.centerwatch.com/drug-information/fda-approved-drugs/medical-conditions/", toupper(letters), sep = "")
get_drug_disease = function(url) {
disease_data = read_html(url) %>%
html_nodes(css = ".ToggleDrugCategory") %>%
html_text() %>%
as_tibble() %>%
mutate(key = value) %>%
rename(disease = value)
drug_data = read_html(url) %>%
html_nodes(css = ".CategoryListSection a:nth-child(1)") %>%
html_text() %>%
as_tibble() %>%
mutate(key = value) %>%
rename(drug = value)
drug_disease_data = read_html(url) %>%
html_nodes(css = "#MainColumn a:nth-child(1)") %>%
html_text() %>%
as_tibble() %>%
mutate(key = value) %>%
rename(drug_disease = value)
drug_disease_joined = left_join(drug_disease_data, drug_data, disease_data, by = "key") %>%
left_join(., disease_data, by = "key") %>%
fill(disease) %>%
select(-c(drug,drug_disease))
toremove = drug_disease_joined %>% distinct(disease) %>% pull()
drug_disease_joined = drug_disease_joined %>%
filter(!key %in% toremove) %>%
rename(drug = key) %>%
.[-c(1:29),]
drug_disease_joined
}
drug_disease_data = map_df(urls, get_drug_disease) %>%
mutate(disease = str_to_title(disease),
drug = str_to_title(drug)) %>%
rename(trade_name = drug) %>%
mutate(trade_name = trimws(trade_name, which = c("both"))) %>%
separate(trade_name, into = c("trade_name", "remove"), sep = " ")
## Warning: Too many values at 959 locations: 2, 3, 4, 5, 6, 8, 9, 10, 11, 12,
## 13, 14, 16, 18, 19, 21, 23, 24, 25, 26, ...
## Warning: Too few values at 139 locations: 7, 64, 68, 76, 109, 122, 132,
## 144, 145, 161, 162, 173, 174, 188, 197, 207, 221, 223, 228, 231, ...
drug_disease_data
## # A tibble: 1,881 x 3
## trade_name remove disease
## * <chr> <chr> <chr>
## 1 Avita Gel Acne
## 2 Benzamycin (Erythromycin Acne
## 3 Clindamycin Phosphate Acne
## 4 Clindamycin Phosphate Acne
## 5 Differin (Adapalene Acne
## 6 Estrostep (Norethindrone Acne
## 7 Finevin <NA> Acne
## 8 Klaron (Sodium Acne
## 9 Ortho Tri-Cyclen Acne
## 10 Retin-A Micro Acne
## # ... with 1,871 more rows
The drug_disease_data contains 1881 observations and 3 variables:
drug: the drug namedisease: the disease name#A: join datasets by ingredient for ingredient and products (left join)
#products = longer dataset + ingredient
product_ingredient = left_join(products, ingredient, by = "ingredient")
product_ingredient
## # A tibble: 40,443 x 20
## ingredient doseform route
## <chr> <chr> <chr>
## 1 ABACAVIR SULFATE SOLUTION ORAL
## 2 ABACAVIR SULFATE SOLUTION ORAL
## 3 ABACAVIR SULFATE TABLET ORAL
## 4 ABACAVIR SULFATE TABLET ORAL
## 5 ABACAVIR SULFATE TABLET ORAL
## 6 ABACAVIR SULFATE TABLET ORAL
## 7 ABACAVIR SULFATE TABLET ORAL
## 8 ABACAVIR SULFATE TABLET ORAL
## 9 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE TABLET ORAL
## 10 ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE TABLET ORAL
## # ... with 40,433 more rows, and 17 more variables: trade_name <chr>,
## # applicant <chr>, strength <chr>, drug_type <chr>, appl_no <int>,
## # product_no <int>, tecodestr <chr>, tecode <int>, approval_month <chr>,
## # approval_day <chr>, approval_year <chr>, otc <fctr>, appfull <chr>,
## # ref_listed_drug <fctr>, ref_std <fctr>, singnum <int>,
## # singredient <chr>
#B: exclusivity and patents left join (more~less)
patent_exclusivity = left_join(patents, exclusivity, by = c("appl_no", "product_no", "drug_type"))
patent_exclusivity
## # A tibble: 22,379 x 18
## drug_type appl_no product_no patent_num pediatric_exclusivity_granted
## <chr> <int> <int> <chr> <fctr>
## 1 B 22425 1 5223510 No
## 2 B 22425 1 5223510 No
## 3 B 22307 1 5288726 No
## 4 B 22307 1 5288726 No
## 5 B 22307 1 5288726 Yes
## 6 B 22307 1 5288726 Yes
## 7 B 21462 1 5344932 No
## 8 B 21462 1 5344932 Yes
## 9 B 22088 1 5362718 No
## 10 B 22088 1 5362718 Yes
## # ... with 22,369 more rows, and 13 more variables:
## # month_pat_expires <chr>, day_pat_expires <chr>,
## # year_pat_expires <chr>, pat_use_code <chr>, delist <chr>,
## # drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## # patent_delisted <fctr>, exclcode <chr>, excldatestr <chr>,
## # month_excl_expires <chr>, date_excl_expires <chr>,
## # year_excl_expires <chr>
#C: join A and B left join: "Orange Book." Thursday: meet to eliminate variables (inner join: only commonalities are preserved) #eliminate drugs that do not match
orange_book = left_join(product_ingredient, patent_exclusivity, by = c("appl_no", "product_no", "drug_type"))
orange_book = orange_book %>%
mutate(ingredient = str_to_title(ingredient),
doseform = str_to_title(doseform),
route = str_to_title(route),
trade_name = str_to_title(trade_name),
applicant = str_to_title(applicant),
otc = str_to_title(otc),
appfull = str_to_title(appfull),
singredient = str_to_title(singredient),
strength = str_to_title(strength))
orange_book
## # A tibble: 64,423 x 35
## ingredient doseform route trade_name applicant
## <chr> <chr> <chr> <chr> <chr>
## 1 Abacavir Sulfate Solution Oral Abacavir Sulfate Hetero Labs Ltd Iii
## 2 Abacavir Sulfate Solution Oral Ziagen Viiv Hlthcare
## 3 Abacavir Sulfate Solution Oral Ziagen Viiv Hlthcare
## 4 Abacavir Sulfate Solution Oral Ziagen Viiv Hlthcare
## 5 Abacavir Sulfate Solution Oral Ziagen Viiv Hlthcare
## 6 Abacavir Sulfate Tablet Oral Abacavir Sulfate Apotex Inc
## 7 Abacavir Sulfate Tablet Oral Abacavir Sulfate Aurobindo Pharma Ltd
## 8 Abacavir Sulfate Tablet Oral Abacavir Sulfate Hetero Labs Ltd Iii
## 9 Abacavir Sulfate Tablet Oral Abacavir Sulfate Mylan Pharms Inc
## 10 Abacavir Sulfate Tablet Oral Abacavir Sulfate Strides Pharma
## # ... with 64,413 more rows, and 30 more variables: strength <chr>,
## # drug_type <chr>, appl_no <int>, product_no <int>, tecodestr <chr>,
## # tecode <int>, approval_month <chr>, approval_day <chr>,
## # approval_year <chr>, otc <chr>, appfull <chr>, ref_listed_drug <fctr>,
## # ref_std <fctr>, singnum <int>, singredient <chr>, patent_num <chr>,
## # pediatric_exclusivity_granted <fctr>, month_pat_expires <chr>,
## # day_pat_expires <chr>, year_pat_expires <chr>, pat_use_code <chr>,
## # delist <chr>, drug_substance_flag <fctr>, drug_prod_flag <fctr>,
## # patent_delisted <fctr>, exclcode <chr>, excldatestr <chr>,
## # month_excl_expires <chr>, date_excl_expires <chr>,
## # year_excl_expires <chr>
# final goal: disease, funding, drug prices, and Orange Book
# eliminate unnecessary variables from orange book dataset
orange_book_clean = orange_book %>%
rename(application_no = appl_no,
active_ingredient = singredient,
dose_form = doseform) %>%
select(-c(ingredient, applicant, product_no, tecodestr, tecode, approval_day, ref_std, singnum, day_pat_expires, pat_use_code, drug_substance_flag, drug_prod_flag, exclcode, excldatestr, date_excl_expires)) %>%
select(trade_name, everything()) %>%
separate(dose_form, into = c("dose_form", "remove")) %>%
filter(is.na(remove)) %>%
select(-remove) %>%
filter(str_detect(dose_form, regex("Tablet", ignore_case = TRUE))) %>%
mutate(trade_name = trimws(trade_name, which = c("both")),
trade_name = str_replace_all(trade_name, ", ", "-"),
trade_name = str_replace_all(trade_name, " And ", "-"))
## Warning: Too many values at 8293 locations: 78, 79, 80, 81, 82, 153, 154,
## 155, 156, 157, 158, 159, 634, 635, 636, 1085, 1086, 1087, 1088, 1089, ...
## Warning: Too few values at 51953 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
# inner join of disease drug and orange book =y by trade_name
final_dataset =
new_drug_price %>%
inner_join(., orange_book_clean, by = c("trade_name", "drug_type", "otc")) %>%
distinct() %>%
left_join(., drug_disease_data, by = "trade_name") %>%
distinct() %>%
separate(disease, into = c("disease", "remove"), sep = " ") %>%
left_join(., final_NIH_funding, by = c("disease", "year")) %>%
as_tibble() %>%
select(-c(remove, remove.x)) %>%
distinct()
## Warning: Column `otc` joining factor and character vector, coercing into
## character vector
## Warning: Too many values at 11500 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
## Warning: Too few values at 4247 locations: 1366, 1368, 1370, 1372, 1374,
## 1376, 1459, 1460, 1462, 1463, 1465, 1466, 1468, 1469, 1471, 1472, 1474,
## 1475, 1477, 1478, ...
Josie
# Remomve some unecessary datasets
rm(patent_exclusivity, product_ingredient)
Manali
#do rare diseases recieve less funding?
#set.seed(1)
#final_dataset %>%
# sample_n(50)%>%
#plot_ly(x~funding, y~prevalence, type = "scatter")
#does funding correlate with drug prices?
#final_dataset %>%
# sample_n(50)%>%
# plot_ly(x ~ price, y ~ funding, type = "scatter")
We created a spaghetti plot of the average price of drugs per year for each drug. We only used data on prices per drug unit (mainly per tablet). Information about the disease that the drugs treat can be seen by hovering over the plot. We notice that of the drugs with information on price per unit drug, the drugs for Hepatitis seem to have the highest cost. Unfortunately we cannot tell which kind of hepatitis these drugs treat.
Possible case studies on most popular (perceived) drugs: Lipitor, Xanax, Celebrex, Prozac, Crestor, Nexium, Lisinopril, Zoloft
final_dataset %>%
filter(pricing_unit == "EA" & !is.na(disease)) %>%
plot_ly(x = ~year, y = ~avg_price, type = "scatter", mode = "lines", text = ~paste("Drug: ", trade_name, '<br>Disease:', disease), color = ~trade_name) %>%
layout(showlegend = FALSE)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
We also created a scatter plot of the mean NIH funding per drug (assumed to be the same as the NIH funding for disease research) against the demand of the drug (defined as the 2015 disease prevalence) colored by disease. Among drugs with available information on price per unit drug, obesity seems to have received the most funding on average and to have the highest 2015 prevalence.
final_dataset %>%
filter(pricing_unit == "EA" & !is.na(funding) & !is.na(prevalence)) %>%
group_by(trade_name, disease, prevalence) %>%
summarize(mean_fund = mean(funding)) %>%
plot_ly(x = ~prevalence, y = ~mean_fund, type = "scatter", alpha = 1, text = ~paste("Drug: ", trade_name, '<br>Disease:', disease), color = ~disease) %>%
layout(showlegend = FALSE)
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
MeOak